/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is "com.andreasfink.utils" Java(TM) utilities module.
 *
 * The Initial Developer of the Original Code is
 * Andreas Fink, oss@andreasfink.com.
 * Portions created by the Initial Developer are Copyright (C) 2010
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
package com.andreasfink.utils.text;

import java.text.Normalizer;
import java.util.BitSet;
import java.util.regex.Pattern;

/**
 * String encoding for SEO purposes, for example beautiful urls or wiki-words.
 *
 * <p>The class only offers static methods. Its lookup tables and patterns are
 * built once during class initialisation and are only read afterwards.
 *
 * @author oss@andreasfink.com
 */
public final class SEOEncoder {

    /** Characters that {@link #encodeGermanFast(String)} copies unchanged. */
    private static final BitSet PASS_THROUGH = buildPassThrough();

    /**
     * Replacement strings indexed by character code; {@code null} means
     * "no replacement". Every mapped character lies in the Latin-1 range, so
     * the table is kept small and lookups are bounds-checked by the caller.
     */
    private static final String[] REPLACEMENTS = buildReplacements();

    /** Runs of combining diacritical marks left over after NFD normalisation. */
    private static final Pattern DIACRITICAL_MARKS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /** Runs of characters that are neither ASCII alphanumerics nor '-' nor '+'. */
    private static final Pattern UNSUITABLE_RUN =
            Pattern.compile("[^\\p{Alnum}-\\+]+");

    private SEOEncoder() {}

    /** Builds the set of characters that are kept as they are: A-Z, a-z, 0-9, '-', '_', '+'. */
    private static BitSet buildPassThrough() {
        final BitSet keep = new BitSet(256);
        keep.set('a', 'z' + 1);
        keep.set('A', 'Z' + 1);
        keep.set('0', '9' + 1);
        keep.set('-');
        keep.set('_');
        keep.set('+');
        return keep;
    }

    /** Builds the replacement table for German special characters, '&amp;', ' ' and '/'. */
    private static String[] buildReplacements() {
        final String[] table = new String[0x100];
        table['\u00c4'] = "Ae";
        table['\u00d6'] = "Oe";
        table['\u00dc'] = "Ue";
        table['\u00e4'] = "ae";
        table['\u00f6'] = "oe";
        table['\u00fc'] = "ue";
        table['\u00df'] = "ss";
        table['&'] = "+";
        table[' '] = "_";
        table['/'] = "-";
        return table;
    }

    /**
     * German Umlauts and Eszett are collated as in phonebooks (e.g.: &Auml; to Ae).<br>
     * &amp; is converted to +, the space character to _, / to -.<br>
     * All other chars except A-Z, a-z, 0-9, -, _, + are discarded; this
     * includes whitespace other than the plain space (tab, newline, ...).
     *
     * @param text the text to encode, must not be {@code null}
     * @return the encoded text, possibly empty
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String encodeGermanFast(final String text) {
        final int length = text.length();
        final StringBuilder out = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            final char c = text.charAt(i);
            if (PASS_THROUGH.get(c)) {
                out.append(c);
            } else if (c < REPLACEMENTS.length && REPLACEMENTS[c] != null) {
                // The bounds check matters: chars can be as large as '\uFFFF',
                // far beyond the table, and must simply be discarded.
                out.append(REPLACEMENTS[c]);
            }
        }
        return out.toString();
    }

    /**
     * Replaces all characters unsuitable for URLs with logical alternatives using
     * <code>java.text.Normalizer</code>.<br>
     * The text is decomposed (NFD) and its combining diacritical marks are removed,
     * &amp; becomes + and / becomes -. Every remaining run of characters other than
     * ASCII letters, ASCII digits, - and + is collapsed into a single _. A trailing
     * _ is removed from the result; a leading _ is kept.
     *
     * TODO take care of tapestries url-encoding &amp; -&gt; + -&gt; $002b
     *
     * @param text the text to encode, must not be {@code null}
     * @return the encoded text, possibly empty
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String encodeUnicode(final String text) {
        final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
        final String withoutDiacritics = DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
        final String substituted = withoutDiacritics.replace('&', '+').replace('/', '-');
        final String encoded = UNSUITABLE_RUN.matcher(substituted).replaceAll("_");
        // Runs are collapsed above, so at most one trailing underscore can exist.
        return encoded.endsWith("_")
                ? encoded.substring(0, encoded.length() - 1)
                : encoded;
    }
}